import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import plotly.offline as py
C:\Users\HP\AppData\Local\Temp\ipykernel_5464\3042834768.py:5: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead. from pandas_profiling import ProfileReport
# Load the Telco customer-churn dataset from a local CSV file and
# display it to confirm the read succeeded.
csv_path = r"C:\Users\HP\Desktop\Projects File\Customer Churn\WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(csv_path)
data
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | ... | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
| 7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | ... | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
| 7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
| 7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
| 7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 21 columns
# Pandas Profiling: generate an automated EDA report for the whole dataset.
# `pandas_profiling` is deprecated (see the DeprecationWarning above); prefer
# its renamed successor `ydata_profiling`, falling back to the old package
# only when the new one is not installed.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport
profile = ProfileReport(data)
profile
# profile.to_file(output_file='report.html')
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Printing out the info of our columns to get a brief summary of the dataset:
# per-column dtype and non-null count. This covers the same ground as the
# pandas_profiling report above, in compact text form.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
# Printing out the shape of our dataset as (rows, columns).
data.shape
(7043, 21)
# Encode the target column: Churn 'Yes' -> 1, 'No' -> 0, so the machine
# learning models can consume it. `.map` is preferred over `.replace` for a
# full recode: replacing strings with ints via a dict triggers pandas'
# downcasting FutureWarning, and map makes any unexpected value visible as NaN.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})
print(data['Churn'].head())
0 0 1 0 2 1 3 0 4 1 Name: Churn, dtype: int64
#Converting "No internet Service" for the following to "No", ( OnlineSecurity,StreamingTV,
# DeviceProtection,TechSupport,OnlineSecurity,StreamingTV)
cols = ['OnlineSecurity','StreamingTV','DeviceProtection','TechSupport','OnlineBackup','StreamingMovies','MultipleLines']
for col in cols:
data.loc[data[col] == 'No internet service', col] = 'No'
print(data[cols].head())
OnlineSecurity StreamingTV DeviceProtection TechSupport OnlineBackup \ 0 No No No No Yes 1 Yes No Yes No No 2 Yes No No No Yes 3 Yes No Yes Yes No 4 No No No No No StreamingMovies MultipleLines 0 No No phone service 1 No No 2 No No 3 No No phone service 4 No No
# Same normalization for MultipleLines: 'No phone service' becomes 'No',
# leaving a clean binary Yes/No column.
data['MultipleLines'] = data['MultipleLines'].replace('No phone service', 'No')
print(data['MultipleLines'].unique())
['No' 'Yes']
# Clean TotalCharges, which is read as object because some rows hold blanks.
# pd.to_numeric with errors='coerce' turns ANY non-numeric entry into NaN —
# more robust than replacing the single-space string ' ' only.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Drop the rows whose TotalCharges could not be parsed.
data.dropna(subset=['TotalCharges'], inplace=True)
# Keep the original float32 dtype for the column.
data['TotalCharges'] = data['TotalCharges'].astype('float32')
# Class balance of the target: counts of retained (0) vs churned (1) customers.
churn_counts = data['Churn'].value_counts()
churn_counts
0 5163 1 1869 Name: Churn, dtype: int64
# Visualize the overall churn split as a pie chart.
import plotly.express as px

churn_counts = data['Churn'].value_counts()
fig = px.pie(
    names=churn_counts.index,
    values=churn_counts.values,
    title='Percentage of Churn Customers',
    color_discrete_sequence=['lightgrey', 'red'],
)
fig.show()
From the above, we can see that only 26% of the customers in the dataset churned.
# Churn by Gender: grouped counts of churned vs retained customers.
import plotly.express as px

fig = px.histogram(
    data, x='gender', color='Churn',
    labels={'gender': 'Gender'},
    color_discrete_sequence=['seagreen', 'lightsalmon'],
    title='Gender vs Churn',
)
fig.update_layout(
    xaxis_title='Gender',
    yaxis_title='Count',
    legend_title='Churn',
    legend_orientation='h',
)
# Relabel the traces so the legend reads 'No'/'Yes' instead of the raw 0/1.
for idx, trace in enumerate(fig.data):
    trace.name = 'No' if idx == 0 else 'Yes'
fig.show()
Here we can see that women were slightly more likely to churn than men.
# Churn rate by Tech Support service.
# Aggregate to the MEAN of the 0/1 Churn column so the y-axis is a true
# churn rate in [0, 1] — plotting the raw Churn values with px.bar stacks
# thousands of 0/1 bars into a count, which the '.0%' tick format would
# then mislabel by orders of magnitude.
colors = ['rebeccapurple', 'seagreen']
churn_by_support = data.groupby('TechSupport', as_index=False)['Churn'].mean()
fig = px.bar(churn_by_support, x='TechSupport', y='Churn',
             title='Churn Rate by Tech Support Service',
             color_discrete_sequence=colors)
fig.update_yaxes(tickformat='.0%')
fig.show()
People without tech support had a much higher churn rate than people with tech support.
# Churn rate by internet service type.
# Group to the mean of the 0/1 Churn column so the '.0%' axis format shows
# a genuine percentage (the raw px.bar stack would show summed counts).
import plotly.express as px
churn_by_internet = data.groupby('InternetService', as_index=False)['Churn'].mean()
fig = px.bar(churn_by_internet, x='InternetService', y='Churn',
             title='Churn Rate by Internet Service')
fig.update_yaxes(tickformat='.0%')
fig.show()
People with fiber-optic internet service are more likely to churn.
# Churn rate by payment method: mean of the 0/1 Churn column per method,
# so the '.0%' tick format displays a true rate rather than a raw count.
colors = ['lightslategray', 'red', 'seagreen']
churn_by_payment = data.groupby('PaymentMethod', as_index=False)['Churn'].mean()
fig = px.bar(churn_by_payment, x='PaymentMethod', y='Churn',
             color_discrete_sequence=colors,
             title='Churn Rate by Payment Method')
fig.update_yaxes(tickformat='.0%')
fig.show()
# Churn rate by contract duration: aggregate to the mean churn per contract
# type so the y-axis is a genuine rate in [0, 1] (raw stacked 0/1 values
# would be a count, mislabelled by the '.0%' format).
import plotly.express as px
churn_by_contract = data.groupby('Contract', as_index=False)['Churn'].mean()
fig = px.bar(churn_by_contract, x='Contract', y='Churn',
             title='Churn Rate by Contract Duration')
fig.update_yaxes(tickformat='.0%')
fig.show()
People with longer-duration contracts were less likely to churn.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Integer-encode every remaining categorical column, one fresh LabelEncoder
# per column. LabelEncoder sorts classes alphabetically before assigning
# codes (e.g. Female -> 0, Male -> 1).
cat_cols = ["gender","Partner","Dependents","PhoneService",
            "MultipleLines","InternetService","OnlineSecurity",
            "OnlineBackup","DeviceProtection","TechSupport",
            "StreamingTV","StreamingMovies","Contract",
            "PaperlessBilling","PaymentMethod","Churn"]
for column in cat_cols:
    data[column] = LabelEncoder().fit_transform(data[column])
print(data[cat_cols].head())
gender Partner Dependents PhoneService MultipleLines InternetService \ 0 0 1 0 0 0 0 1 1 0 0 1 0 0 2 1 0 0 1 0 0 3 1 0 0 0 0 0 4 0 0 0 1 0 1 OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV \ 0 0 1 0 0 0 1 1 0 1 0 0 2 1 1 0 0 0 3 1 0 1 1 0 4 0 0 0 0 0 StreamingMovies Contract PaperlessBilling PaymentMethod Churn 0 0 0 1 2 0 1 0 1 0 3 0 2 0 0 1 3 1 3 0 1 0 0 0 4 0 0 1 2 1
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Scale the three numeric columns into [0, 1] so no single feature
# dominates the distance-based / gradient-based models below.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
data[cols_to_scale] = MinMaxScaler().fit_transform(data[cols_to_scale])

# Verify the scaled values.
print(data[cols_to_scale].head())
tenure MonthlyCharges TotalCharges 0 0.000000 0.115423 0.001275 1 0.464789 0.385075 0.215867 2 0.014085 0.354229 0.010310 3 0.619718 0.239303 0.210241 4 0.014085 0.521891 0.015330
# Preview the first rows of the fully encoded and scaled dataset.
data.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | 0 | 0 | 1 | 0 | 0.000000 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0.115423 | 0.001275 | 0 |
| 1 | 5575-GNVDE | 1 | 0 | 0 | 0 | 0.464789 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 3 | 0.385075 | 0.215867 | 0 |
| 2 | 3668-QPYBK | 1 | 0 | 0 | 0 | 0.014085 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0.354229 | 0.010310 | 1 |
| 3 | 7795-CFOCW | 1 | 0 | 0 | 0 | 0.619718 | 0 | 0 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0.239303 | 0.210241 | 0 |
| 4 | 9237-HQITU | 0 | 0 | 0 | 0 | 0.014085 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0.521891 | 0.015330 | 1 |
5 rows × 21 columns
# List all column names before selecting features for modeling.
data.columns
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
# Feature/target variables: y is the 0/1 Churn label; X is every other
# column except the non-predictive customerID.
y = data['Churn']
X = data.drop(columns=['Churn','customerID'])
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Split data into train and test sets (33% held out) and sanity-check shapes.
# NOTE(review): this split is immediately superseded by the 70/30 split in the
# next cell, so the shapes printed here are not the ones used for modeling.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(4711, 19) (4711,) (2321, 19) (2321,)
# Import performance metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Re-split the data 70/30 for the model comparison below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Candidate classifiers, all evaluated on the same train/test split.
models = [DecisionTreeClassifier(), LogisticRegression(), RandomForestClassifier(), GaussianNB()]

for clf in models:
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    # Report per-class precision/recall/F1, the confusion matrix, and accuracy.
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(accuracy_score(y_test, predictions))
precision recall f1-score support
0 0.81 0.80 0.81 1549
1 0.47 0.49 0.48 561
accuracy 0.72 2110
macro avg 0.64 0.65 0.64 2110
weighted avg 0.72 0.72 0.72 2110
[[1241 308]
[ 285 276]]
0.718957345971564
precision recall f1-score support
0 0.84 0.90 0.87 1549
1 0.64 0.52 0.57 561
accuracy 0.80 2110
macro avg 0.74 0.71 0.72 2110
weighted avg 0.79 0.80 0.79 2110
[[1388 161]
[ 270 291]]
0.795734597156398
precision recall f1-score support
0 0.83 0.89 0.86 1549
1 0.63 0.49 0.55 561
accuracy 0.79 2110
macro avg 0.73 0.69 0.70 2110
weighted avg 0.77 0.79 0.78 2110
[[1386 163]
[ 288 273]]
0.7862559241706161
precision recall f1-score support
0 0.88 0.76 0.82 1549
1 0.52 0.72 0.61 561
accuracy 0.75 2110
macro avg 0.70 0.74 0.71 2110
weighted avg 0.79 0.75 0.76 2110
[[1184 365]
[ 159 402]]
0.7516587677725118
# Hyperparameter tuning for the random forest.
# (Fixed: GridSearchCV and RandomForestClassifier were each imported twice
# in this cell.)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search grid around the best region found during tuning.
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 5-fold cross-validated exhaustive search over the grid.
rf = RandomForestClassifier()
grid = GridSearchCV(rf, param_grid, cv=5)
grid.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
'min_samples_split': [2, 5],
'n_estimators': [100, 200]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
'min_samples_split': [2, 5],
'n_estimators': [100, 200]})RandomForestClassifier()
RandomForestClassifier()
# Rebuild the random forest with the winning hyperparameters from the grid
# search and fit it on the training split.
rf = RandomForestClassifier(**grid.best_params_)
rf.fit(X_train, y_train)
RandomForestClassifier(min_samples_split=5, n_estimators=200)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(min_samples_split=5, n_estimators=200)
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the tuned forest on the training data.
scores = cross_val_score(rf, X_train, y_train, cv=5)
print("Cross Validation Accuracy:", scores.mean())
Cross Validation Accuracy: 0.7978498617473485
# Hold-out performance of the tuned forest, shown next to its CV estimate
# to check for overfitting.
y_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("Cross Validation Accuracy:", scores.mean())
Test Accuracy: 0.7933649289099526 Cross Validation Accuracy: 0.7978498617473485